package com.gujiayue.common.utils;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import okhttp3.MediaType;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;

/**
 * Static helpers for issuing blocking HTTP requests through a shared OkHttp client, plus a small
 * HTML clean-up helper and a demo {@link #main(String[])} that lists the links found on a page.
 */
public class RequestUtils {

	/** Media type attached to the request bodies built by {@link #post(String, String)}. */
	private static final MediaType JSON = MediaType.parse("application/json; charset=utf-8");

	/** Single client instance reused by every request made through this class. */
	private static final OkHttpClient CLIENT = new OkHttpClient();

	/** Matches any tag that starts with {@code <br}, e.g. {@code <br>} or {@code <br/>} (case-sensitive). */
	private static final Pattern OPENING_BR = Pattern.compile("<br([^>]*)>");

	/**
	 * Matches an anchor element and captures its {@code href} value in the named group {@code url}
	 * and its inner text in the named group {@code text}.
	 */
	private static final Pattern ANCHOR = Pattern.compile(
			"<a(?:(?!href=).)*href=(['\"\"]?)(?<url>[^\"\"\\s>]*)\\1[^>]*>(?<text>(?:(?!</?a\\b).)*)</a>",
			Pattern.CASE_INSENSITIVE);

	/**
	 * Sends {@code json} as the body of a synchronous POST request.
	 *
	 * @param url  target URL
	 * @param json request body, sent with the {@code application/json; charset=utf-8} media type
	 * @return the response exactly as returned by OkHttp; it is not closed here, so the caller
	 *         must close it (or fully consume its body) once done
	 * @throws IOException if executing the call fails
	 */
	public static Response post(String url, String json) throws IOException {
		RequestBody body = RequestBody.create(JSON, json);
		Request request = new Request.Builder().url(url).post(body).build();
		return CLIENT.newCall(request).execute();
	}

	/**
	 * Sends a synchronous GET request.
	 *
	 * @param url target URL
	 * @return the response exactly as returned by OkHttp; it is not closed here, so the caller
	 *         must close it (or fully consume its body) once done
	 * @throws IOException if executing the call fails
	 */
	public static Response get(String url) throws IOException {
		Request request = new Request.Builder().url(url).build();
		return CLIENT.newCall(request).execute();
	}

	/**
	 * Removes every {@code br} tag from the given text.
	 *
	 * <p>Despite its name, this method only strips {@code br} tags: first everything matching
	 * {@code <br...>}, then any literal {@code </br>}. Matching is case-sensitive and all other
	 * markup is left untouched.
	 *
	 * @param cs text to clean; must not be {@code null}
	 * @return {@code cs} with the {@code br} tags removed
	 */
	public static String deleteNotBrHtml(String cs) {
		String withoutOpeningTags = OPENING_BR.matcher(cs).replaceAll("");
		// "</br>" contains no regex metacharacters, so a literal replace is equivalent to replaceAll.
		return withoutOpeningTags.replace("</br>", "");
	}

	/**
	 * Demo entry point: downloads the 2019 index page of the stats.gov.cn administrative-division
	 * code listing, strips {@code br} tags and prints every link as {@code url--->text}, one per line.
	 *
	 * <p>Author's notes on how the relative links nest (each href is relative to the page it was
	 * found on):
	 * <pre>
	 * province index: http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html
	 * 13.html           -> .../2019/13.html
	 * 13/1303.html      -> .../2019/13/1303.html
	 * 03/130321.html    -> .../2019/13/03/130321.html
	 * 21/130321202.html -> .../2019/13/03/21/130321202.html
	 * </pre>
	 *
	 * @param args ignored
	 * @throws IOException if the request fails, the server answers with a non-2xx status, or the
	 *                     body cannot be read or decoded
	 */
	public static void main(String[] args) throws IOException {
		String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html";

		String str;
		// try-with-resources releases the connection even if reading the body fails.
		try (Response response = get(url)) {
			if (!response.isSuccessful()) {
				throw new IOException("Unexpected HTTP status " + response.code() + " for " + url);
			}
			// The page is decoded as GBK rather than with the platform default charset.
			str = new String(response.body().bytes(), "GBK");
		}

		str = deleteNotBrHtml(str);
		Matcher m = ANCHOR.matcher(str);

		while (m.find()) {
			// Print the values as data: passing them as a format string would throw on any '%'
			// contained in the scraped URL or link text.
			System.out.println(m.group("url") + "--->" + m.group("text"));
		}
	}
}
